# vim: set noet ts=4:
#
# scim-python
#
# Copyright (c) 2007-2008 Huang Peng <shawn.p.huang@gmail.com>
#
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA  02111-1307  USA
#
# $Id: $
#
import sys, os, re
import bz2

try:
	import PYUtil
	import PYSQLiteDB
except:
	path = os.path.dirname (__file__)
	path = os.path.join (path, "..")
	path = os.path.abspath (path)
	sys.path.append (path)
	import PYUtil
	import PYSQLiteDB


def load_qq_phrases (filename, hanzi_dic, sogou_phrase = {}):
	bzf = bz2.BZ2File (filename, "r")
	
	def parse_qq_phrase (f):	
		is_pinyin = lambda x: (x >= u"a" and x <= u"z")
		is_hanzi = lambda x: not (is_pinyin (x) or x == u"0")
		for l in f:
			l = unicode (l, "utf8").strip () + "0"
			phrase = []
			pinyin = []
			current_pinyin = None
			state = 0
			for c in l:
				if state == 0: # expect a hanzi
					if is_hanzi (c):
						phrase.append (c)
						state = 1
					else:
						raise Exception (l)
				elif state == 1: # expect a pinyin start or a hanzi
					if is_pinyin (c): # a pinyin start
						current_pinyin = [c]
						state = 2
					elif is_hanzi (c): # an hanzi
						pinyin.append (None)
						phrase.append (c)
					else: # finished
						pinyin.append (None)
						state = 3
				elif state == 2: # expect pinyin continue or hanzi
					if is_pinyin (c): # pinyin continue
						current_pinyin.append (c)
					elif is_hanzi (c): # hanzi
						pinyin.append (u"".join (current_pinyin))
						current_pinyin = None
						phrase.append (c)
						state = 1
					else: # finished
						pinyin.append (u"".join (current_pinyin))
						state = 3
				else: # finished
					continue
			i = 0
			for hanzi in phrase:
				if pinyin [i] == None:
					pys = hanzi_dic[hanzi]
					if len (pys) != 1:
						raise Exception (l)
					pinyin[i] = pys.keys ()[0]
				else:
					if pinyin[i] not in hanzi_dic[hanzi]:
						yield (u"".join (phrase), None)
						break
				i += 1
			yield (u"".join (phrase), u"'".join (map (str, pinyin)))
	
	i = 1
	try:
		for phrase, pinyin in parse_qq_phrase (bzf):
			if pinyin != None:
				line = u"%s\t%s\t%d" % (phrase, pinyin, sogou_phrase.get (phrase, [0,0])[1])
				print line.encode ("utf8")
			i += 1
	except Exception, e:
		print u"%d : %s" % (i, e.message)
	
def main ():
	srcdir = "."
	if len (sys.argv) == 2:
		srcdir = sys.argv[1]

	# filename = "py.db"
	# try:
	# 	os.unlink (filename)
	# except:
	# 	pass
	# print "Load phrase freq data"
	#	freq_dict = {}
	#	for l in file (os.path.join (srcdir, "SogouLabDic-utf8.dic")):
	#		l = unicode (l, "utf8")
	#		l = re.split (ur"\t+", l)
	#		freq_dict [l[0]] = int (l[1])
	#	
	#	print "Load char freq data"
	#	for l in file (os.path.join (srcdir, "CharFreq-Modern_utf8.txt")):
	#		l = unicode (l, "utf8")
	#		l = re.split (ur"\t+", l)
	#		freq_dict [l[0]] = int (l[2])
	# 
	# print "Create DB"
	# db = PYSQLiteDB.PYSQLiteDB (filename)
	# db.create_tables ()
	# db.init_pinyin_table ()
	# db.init_shengmu_table ()

	print "Load pinyin_table.txt.bz2"
	filename = os.path.join (srcdir, "pinyin_table.txt.bz2")
	bzf = bz2.BZ2File (filename, "r") 
	hanzi_dic = PYUtil.load_pinyin_table (bzf)

	print "Load SogouLabDic-utf8.dic"
	filename = os.path.join (srcdir, "SogouLabDic-utf8.dic")
	sogou_phrase = PYUtil.load_sogou_phrases (file (filename));

	print "Load qq_pinyin_1.0.txt.bz2"
	filename = os.path.join (srcdir, "qq_pinyin_1.0.txt.bz2")
	qq_phrases = load_qq_phrases (filename, hanzi_dic, sogou_phrase)
	

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
	main ()
